In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
In [2]:
data = r"C:\Users\laxma\Downloads\Employee.csv"
data = pd.read_csv(data)
In [3]:
data.shape
Out[3]:
(4653, 9)
In [4]:
data.head()
Out[4]:
Education JoiningYear City PaymentTier Age Gender EverBenched ExperienceInCurrentDomain LeaveOrNot
0 Bachelors 2017 Bangalore 3 34 Male No 0 0
1 Bachelors 2013 Pune 1 28 Female No 3 1
2 Bachelors 2014 New Delhi 3 38 Female No 2 0
3 Masters 2016 Bangalore 3 27 Male No 5 1
4 Masters 2017 Pune 3 24 Male Yes 2 1
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB
In [6]:
data['LeaveOrNot'].value_counts()
Out[6]:
0    3053
1    1600
Name: LeaveOrNot, dtype: int64
In [7]:
data.duplicated().sum()
Out[7]:
1889
In [8]:
data.drop_duplicates(inplace=True)
In [9]:
data.duplicated().sum()
Out[9]:
0
In [10]:
data.describe()
Out[10]:
JoiningYear PaymentTier Age ExperienceInCurrentDomain LeaveOrNot
count 2764.000000 2764.000000 2764.000000 2764.000000 2764.000000
mean 2015.090449 2.636035 30.952967 2.644356 0.393632
std 1.885943 0.624001 5.108872 1.610610 0.488643
min 2012.000000 1.000000 22.000000 0.000000 0.000000
25% 2013.000000 2.000000 27.000000 1.000000 0.000000
50% 2015.000000 3.000000 30.000000 2.000000 0.000000
75% 2017.000000 3.000000 35.000000 4.000000 1.000000
max 2018.000000 3.000000 41.000000 7.000000 1.000000
In [11]:
data.columns
Out[11]:
Index(['Education', 'JoiningYear', 'City', 'PaymentTier', 'Age', 'Gender',
       'EverBenched', 'ExperienceInCurrentDomain', 'LeaveOrNot'],
      dtype='object')
In [12]:
#VISUALIZATION
In [13]:
plt.bar(data['JoiningYear'],data['City'])
plt.xticks(rotation=90)
plt.show()
In [14]:
fig=px.violin(data,x='Gender',y='Age',color='Gender')
fig.show()
In [15]:
plt.scatter(data['ExperienceInCurrentDomain'],data['Age'],color='red')
plt.xticks(rotation=90)
plt.show()
In [16]:
plt.figure(figsize=(10,4))
sns.countplot(x='EverBenched', data=data, color='b')
plt.show()
In [17]:
plt.figure(figsize=(10,4))
top_car = data['LeaveOrNot'].value_counts().nlargest(10)
sns.countplot(y=data.LeaveOrNot,color='red')
Out[17]:
<AxesSubplot:xlabel='count', ylabel='LeaveOrNot'>
In [18]:
sns.lineplot(x='City', y='LeaveOrNot', data=data)
Out[18]:
<AxesSubplot:xlabel='City', ylabel='LeaveOrNot'>
In [19]:
sns.barplot(data['PaymentTier'],data['Education'],color='cyan')
plt.xticks(rotation=90)
plt.show()
In [20]:
sns.countplot(x='JoiningYear',data=data)
Out[20]:
<AxesSubplot:xlabel='JoiningYear', ylabel='count'>
In [21]:
sns.boxplot(x='ExperienceInCurrentDomain',y='EverBenched',data=data)
Out[21]:
<AxesSubplot:xlabel='ExperienceInCurrentDomain', ylabel='EverBenched'>
In [22]:
sns.violinplot(x='EverBenched',y='LeaveOrNot',data=data)
Out[22]:
<AxesSubplot:xlabel='EverBenched', ylabel='LeaveOrNot'>
In [23]:
#MODEL BUILDING
In [24]:
categorical = [col for col in data.columns if data[col].dtypes == 'O']
categorical
Out[24]:
['Education', 'City', 'Gender', 'EverBenched']
In [25]:
import category_encoders as ce
encoder = ce.OneHotEncoder(cols=['Education', 'City','Gender','EverBenched'])

data = encoder.fit_transform(data)
data.head()
Out[25]:
Education_1 Education_2 Education_3 JoiningYear City_1 City_2 City_3 PaymentTier Age Gender_1 Gender_2 EverBenched_1 EverBenched_2 ExperienceInCurrentDomain LeaveOrNot
0 1 0 0 2017 1 0 0 3 34 1 0 1 0 0 0
1 1 0 0 2013 0 1 0 1 28 0 1 1 0 3 1
2 1 0 0 2014 0 0 1 3 38 0 1 1 0 2 0
3 0 1 0 2016 1 0 0 3 27 1 0 1 0 5 1
4 0 1 0 2017 0 1 0 3 24 1 0 0 1 2 1
In [26]:
X = data.drop('LeaveOrNot', axis=1) #independent

y = data['LeaveOrNot'] #dependent
In [27]:
X.head()
Out[27]:
Education_1 Education_2 Education_3 JoiningYear City_1 City_2 City_3 PaymentTier Age Gender_1 Gender_2 EverBenched_1 EverBenched_2 ExperienceInCurrentDomain
0 1 0 0 2017 1 0 0 3 34 1 0 1 0 0
1 1 0 0 2013 0 1 0 1 28 0 1 1 0 3
2 1 0 0 2014 0 0 1 3 38 0 1 1 0 2
3 0 1 0 2016 1 0 0 3 27 1 0 1 0 5
4 0 1 0 2017 0 1 0 3 24 1 0 0 1 2
In [28]:
y.head()
Out[28]:
0    0
1    1
2    0
3    1
4    1
Name: LeaveOrNot, dtype: int64
In [29]:
!pip install XGBoost
Collecting XGBoost
  Using cached xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
Requirement already satisfied: numpy in d:\anaconda files\lib\site-packages (from XGBoost) (1.24.4)
Requirement already satisfied: scipy in d:\anaconda files\lib\site-packages (from XGBoost) (1.9.1)
Installing collected packages: XGBoost
Successfully installed XGBoost-2.0.3
In [30]:
import xgboost as xgb

#define data_dmatrix
data_dmatrix = xgb.DMatrix(data=X, label=y)
In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
In [32]:
from xgboost import XGBClassifier

params = {
           'objective':'binary:logistic',
           'max_depth':4,
           'aplha':10,
           'learning_rate':1.0,
           'n_estimators':100
        }



xgb_clf = XGBClassifier(**params)


xgb_clf.fit(X_train, y_train)
Out[32]:
XGBClassifier(aplha=10, base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=1.0, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, ...)
In [33]:
print(xgb_clf)
XGBClassifier(aplha=10, base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=1.0, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, ...)
In [34]:
y_pred = xgb_clf.predict(X_test)
In [35]:
from sklearn.metrics import accuracy_score
print('XGBoost model accuracy score:{0:0.4f}'. format(accuracy_score(y_test, y_pred)))
XGBoost model accuracy score:0.7518
In [ ]: